{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 调用xgboost cv搜索最佳n_estimators\n",
    "### 第三步报错，一直没找到原因，请助教老师指导\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 导入模块"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from xgboost import XGBClassifier\n",
    "import xgboost as xgb\n",
    "\n",
    "import pandas as pd \n",
    "import numpy as np\n",
    "\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "from sklearn.metrics import log_loss\n",
    "\n",
    "from matplotlib import pyplot\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "dpath = './data/'\n",
    "train = pd.read_csv(dpath +\"RentListingInquries_FE_train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bathrooms</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>price</th>\n",
       "      <th>price_bathrooms</th>\n",
       "      <th>price_bedrooms</th>\n",
       "      <th>room_diff</th>\n",
       "      <th>room_num</th>\n",
       "      <th>Year</th>\n",
       "      <th>Month</th>\n",
       "      <th>Day</th>\n",
       "      <th>...</th>\n",
       "      <th>walk</th>\n",
       "      <th>walls</th>\n",
       "      <th>war</th>\n",
       "      <th>washer</th>\n",
       "      <th>water</th>\n",
       "      <th>wheelchair</th>\n",
       "      <th>wifi</th>\n",
       "      <th>windows</th>\n",
       "      <th>work</th>\n",
       "      <th>interest_level</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.5</td>\n",
       "      <td>3</td>\n",
       "      <td>3000</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>750.000000</td>\n",
       "      <td>-1.5</td>\n",
       "      <td>4.5</td>\n",
       "      <td>2016</td>\n",
       "      <td>6</td>\n",
       "      <td>24</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>5465</td>\n",
       "      <td>2732.5</td>\n",
       "      <td>1821.666667</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2016</td>\n",
       "      <td>6</td>\n",
       "      <td>12</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2850</td>\n",
       "      <td>1425.0</td>\n",
       "      <td>1425.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2016</td>\n",
       "      <td>4</td>\n",
       "      <td>17</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>3275</td>\n",
       "      <td>1637.5</td>\n",
       "      <td>1637.500000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2016</td>\n",
       "      <td>4</td>\n",
       "      <td>18</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.0</td>\n",
       "      <td>4</td>\n",
       "      <td>3350</td>\n",
       "      <td>1675.0</td>\n",
       "      <td>670.000000</td>\n",
       "      <td>-3.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2016</td>\n",
       "      <td>4</td>\n",
       "      <td>28</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 228 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   bathrooms  bedrooms  price  price_bathrooms  price_bedrooms  room_diff  \\\n",
       "0        1.5         3   3000           1200.0      750.000000       -1.5   \n",
       "1        1.0         2   5465           2732.5     1821.666667       -1.0   \n",
       "2        1.0         1   2850           1425.0     1425.000000        0.0   \n",
       "3        1.0         1   3275           1637.5     1637.500000        0.0   \n",
       "4        1.0         4   3350           1675.0      670.000000       -3.0   \n",
       "\n",
       "   room_num  Year  Month  Day       ...        walk  walls  war  washer  \\\n",
       "0       4.5  2016      6   24       ...           0      0    0       0   \n",
       "1       3.0  2016      6   12       ...           0      0    0       0   \n",
       "2       2.0  2016      4   17       ...           0      0    0       0   \n",
       "3       2.0  2016      4   18       ...           0      0    0       0   \n",
       "4       5.0  2016      4   28       ...           0      0    1       0   \n",
       "\n",
       "   water  wheelchair  wifi  windows  work  interest_level  \n",
       "0      0           0     0        0     0               1  \n",
       "1      0           0     0        0     0               2  \n",
       "2      0           0     0        0     0               0  \n",
       "3      0           0     0        0     0               2  \n",
       "4      0           0     0        0     0               2  \n",
       "\n",
       "[5 rows x 228 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = train['interest_level']\n",
    "X = train.drop(['interest_level'],axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "由于计算资源有限，从原始数据随机选取1/3样本作为训练数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "((16286, 227), (16286,))"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.33,random_state = 666)\n",
    "X_train.shape,y_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. 搜索最佳n_estimators"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def modelfit(alg, X_train, y_train, cv_folds=None, early_stopping_rounds=10):\n",
    "    xgb_param = alg.get_xgb_params()\n",
    "    xgb_param['num_class'] = 3\n",
    "\n",
    "    xgtrain = xgb.DMatrix(X_train, label = y_train)\n",
    "        \n",
    "    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], folds =cv_folds,\n",
    "             metrics='mlogloss', early_stopping_rounds=early_stopping_rounds)\n",
    "  \n",
    "    cvresult.to_csv('1_nestimators.csv', index_label = 'n_estimators')\n",
    "    \n",
    "    n_estimators = cvresult.shape[0]\n",
    "    \n",
    "    alg.set_params(n_estimators = n_estimators)\n",
    "    alg.fit(X_train, y_train, eval_metric='mlogloss')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "kfold无法迭代，不知道什么原因，请助教老师指导。另外，课件案例原始代码运行也出现相同报错。\n",
    "为了后续调参步骤的进行，后续步骤假设最佳n_estimators为270。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "'StratifiedKFold' object is not iterable",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-12-268f98f071c8>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m     11\u001b[0m         seed=3)\n\u001b[0;32m     12\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 13\u001b[1;33m \u001b[0mmodelfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mxgb1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcv_folds\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkfold\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m<ipython-input-11-5c7dc9745bf5>\u001b[0m in \u001b[0;36mmodelfit\u001b[1;34m(alg, X_train, y_train, cv_folds, early_stopping_rounds)\u001b[0m\n\u001b[0;32m      6\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      7\u001b[0m     cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], folds =cv_folds,\n\u001b[1;32m----> 8\u001b[1;33m              metrics='mlogloss', early_stopping_rounds=early_stopping_rounds)\n\u001b[0m\u001b[0;32m      9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     10\u001b[0m     \u001b[0mcvresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'1_nestimators.csv'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex_label\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'n_estimators'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Program Files\\Anaconda3\\lib\\site-packages\\xgboost\\training.py\u001b[0m in \u001b[0;36mcv\u001b[1;34m(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks)\u001b[0m\n\u001b[0;32m    369\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    370\u001b[0m     \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 371\u001b[1;33m     \u001b[0mcvfolds\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmknfold\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnfold\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mseed\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfpreproc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstratified\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfolds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    372\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    373\u001b[0m     \u001b[1;31m# setup callbacks\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Program Files\\Anaconda3\\lib\\site-packages\\xgboost\\training.py\u001b[0m in \u001b[0;36mmknfold\u001b[1;34m(dall, nfold, param, seed, evals, fpreproc, stratified, folds)\u001b[0m\n\u001b[0;32m    236\u001b[0m         \u001b[0midset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mrandidx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mi\u001b[0m \u001b[1;33m*\u001b[0m \u001b[0mkstep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mmin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrandidx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mi\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m*\u001b[0m \u001b[0mkstep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnfold\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    237\u001b[0m     \u001b[1;32melif\u001b[0m \u001b[0mfolds\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 238\u001b[1;33m         \u001b[0midset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mfolds\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    239\u001b[0m         \u001b[0mnfold\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0midset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    240\u001b[0m     \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mTypeError\u001b[0m: 'StratifiedKFold' object is not iterable"
     ]
    }
   ],
   "source": [
    "xgb1 = XGBClassifier(\n",
    "        learning_rate =0.1,\n",
    "        n_estimators=1000,  \n",
    "        max_depth=5,\n",
    "        min_child_weight=1,\n",
    "        gamma=0,\n",
    "        subsample=0.3,\n",
    "        colsample_bytree=0.8,\n",
    "        colsample_bylevel=0.7,\n",
    "        objective= 'multi:softprob',\n",
    "        seed=3)\n",
    "\n",
    "modelfit(xgb1, X_train, y_train, cv_folds = kfold)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
